'''
	实现卷积神经网络

'''
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Input
from keras.layers import convolutional as conv
from keras.layers import MaxPooling2D
from text2vec import text2vec
from keras.utils import to_categorical
from pickle import load,dump

train_file='hotel_corpus/hotel_train.txt'
test_file='hotel_corpus/hotel_dev.txt'
pickle_file='rtdata/train_dev_data.bin'

def CNNs(first_time_run=False):
	'''
		Build, train and evaluate a convolutional neural network for
		4-class text classification on the hotel corpus.

		first_time_run: when True, vectorize the raw corpus files with
		text2vec and pickle the resulting arrays to ``pickle_file``;
		when False (default), load the previously pickled arrays.
		(Originally a hard-coded local flag.)
	'''
	if first_time_run is True:
		# --- Vectorize training data ---
		train_label,input_train=text2vec(train_file)
		input_size=len(input_train)# 4359 samples
		# Each sample becomes a 30x100 "image" with a single channel.
		input_train=input_train.reshape(input_size,30,100,1)
		output_train=to_categorical(train_label[0:input_size],4)
		# --- Vectorize development (validation) data ---
		test_label,input_test=text2vec(test_file)
		test_size=len(input_test)
		input_test=input_test.reshape(test_size,30,100,1)
		output_test=to_categorical(test_label[0:test_size],4)

		# Persist all four arrays, in this fixed order, for later runs.
		with open(pickle_file,'wb') as data:
			dump(input_train,data)
			dump(output_train,data)
			dump(input_test,data)
			dump(output_test,data)
	else:
		# Load the arrays in the same order they were dumped.
		with open(pickle_file,'rb') as data:
			input_train=load(data)
			output_train=load(data)
			input_test=load(data)
			output_test=load(data)
	# Hyper-parameters.
	inputshape=(30,100,1)
	kernel=(5,5)
	poolsize=(3,3)

	print('#初始化序列模型')
	model=Sequential()

	print('#添加层')
	# First conv stage carries the input shape and is max-pooled.
	model.add(conv.Conv2D(filters=1,kernel_size=kernel,
		padding='same',input_shape=inputshape))
	model.add(Activation("relu"))
	model.add(MaxPooling2D(pool_size=poolsize))
	model.add(Dropout(0.2))	# randomly drop 20% of activations

	# Three identical conv+relu+dropout stages (no pooling in between).
	for _ in range(3):
		model.add(conv.Conv2D(filters=1,kernel_size=kernel,padding='same'))
		model.add(Activation("relu"))
		model.add(Dropout(0.2))

	model.add(MaxPooling2D(pool_size=poolsize))

	model.add(Flatten())	# transition from convolutional to dense layers

	# 4-way softmax output, one unit per opinion class.
	model.add(Dense(4))
	model.add(Activation('softmax'))

	print('#编译模型')
	# loss: objective function; optimizer: gradient-descent variant.
	model.compile(loss='categorical_crossentropy',
		optimizer='sgd',
		metrics=['accuracy'])

	print('#训练模型')
	# verbose=2: one log line per epoch (1 would show a progress bar).
	model.fit(input_train, output_train, epochs=50, batch_size=32,
		verbose=2)

	print('#评估模型')
	model.summary()	# print a model overview
	score = model.evaluate(input_test, output_test, batch_size=32,
		verbose=2)
	print('测试分数：',score[0])
	print('测试精度：',score[1])

def test():
	'''Smoke-test entry point: run the full CNN train/evaluate pipeline once.'''
	return CNNs()

# Script-style execution: training starts as soon as the module runs.
test()






'''
			自编解码器：自监督学习算法
				数据的压缩和解压函数：数据集相关，有损，自动学习
			自编码器作数据压缩和JPEG比没有性能优势
			自编码器的应用：1、数据去噪，2、降维
			
				
'''
from keras.models import Model
from keras.layers import Dense,Activation
from keras.layers import Input
from keras.layers.recurrent  import LSTM,GRU
from data_predeal import PICKLE_FILE,LENGTH,read_3d
import numpy as np
np.random.seed(1234)

def build(act='tanh'):
	'''
	Construct an LSTM autoencoder over (40, 100) input sequences.

	act: activation used by every LSTM layer. Defaults to 'tanh',
	Keras' own LSTM default. (The original referenced an undefined
	name ``act``, which raised NameError.)

	Returns the uncompiled Keras Model.
	'''
	start=Input(shape=(40,100))
	# Encoder: progressively compress the feature width 100 -> 32 -> 16 -> 8.
	encoder=LSTM(32,return_sequences=True,activation=act)(start)
	encoder=LSTM(16,return_sequences=True,activation=act)(encoder)
	encoder=LSTM(8,return_sequences=True,activation=act)(encoder)

	# Decoder: expand back out. The original chained through an
	# undefined name ``decoder`` (NameError); it must feed ``decode``.
	# NOTE(review): the final width 4000 does not match the input
	# feature width 100 — confirm this is the intended reconstruction size.
	decode=LSTM(16,return_sequences=True,activation=act)(encoder)
	decode=LSTM(32,return_sequences=True,activation=act)(decode)
	end=LSTM(4000,return_sequences=True,activation=act)(decode)

	model=Model(inputs=start,outputs=end)
	return model
	
def autocoder(model_file='rtdata/autocoder.h5'):
	'''
	Train the LSTM autoencoder on the 3-D corpus data and save it.

	model_file: path where the trained model is saved. The original
	referenced an undefined name ``model_file`` (NameError); the
	default path here follows the file's 'rtdata/' convention —
	TODO confirm the intended location.
	'''
	input_train,output_train,input_dev,output_dev=read_3d(test=False)
	model=build()
	# metrics=[] : only the loss is tracked during training.
	model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=[])
	model.summary()
	model.fit(input_train, output_train, epochs=150, batch_size=64,verbose=2
			,validation_data=(input_dev,output_dev),callbacks=[])
	model.save(model_file)
	
	
	




'''
			使用支持向量机进行意见分类
				对比神经网络的效果。
'''

from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pickle import load
import numpy as np
np.random.seed(1432)


pickle_file1='rtdata/train_dev_test40.bin'

def read_data(train=True):
	'''
	Load the pickled train/dev/test splits and flatten each input from
	a (LENGTH, 100) word-vector matrix to one feature row per sample.

	train: when True return (train_x, train_y, dev_x, dev_y);
	otherwise return (test_x, test_y). One-hot labels are converted
	back to integer class indices via argmax.
	'''
	LENGTH=40
	# The pickle file stores exactly six arrays in this fixed order.
	with open(pickle_file1,'rb') as data:
		arrays=[load(data) for _ in range(6)]
	input_train,output_train,input_dev,output_dev,input_test,output_test=arrays
	# Flatten every sample into a 1-D vector (SVMs expect 2-D input).
	size=len(input_train)
	size0=len(input_dev)
	size1=len(input_test)
	input_train=input_train.reshape(size,LENGTH*100)
	input_dev=input_dev.reshape(size0,LENGTH*100)
	input_test=input_test.reshape(size1,LENGTH*100)
	# One-hot label matrices back to integer class ids.
	output_train=np.argmax(output_train,axis=-1)
	output_dev=np.argmax(output_dev,axis=-1)
	output_test=np.argmax(output_test,axis=-1)
	print('三个集合的大小:',size,size0,size1)
	# Choose the return value by intended use.
	if train is True:
		return input_train,output_train,input_dev,output_dev
	return input_test,output_test
		
		
def svm_used():
	'''
	Train a linear one-vs-rest SVM on the flattened text vectors,
	report dev-set accuracy, then print a classification report and
	confusion matrix for the held-out test split.

	Notes kept from the original author:
	  * Feeding 3-D arrays raises "Found array with dim 3. Estimator
	    expected <= 2." — inputs must be flattened to vectors first.
	  * Key SVC parameters: C (penalty term, usually a power of 10),
	    kernel ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'),
	    degree (poly order, default 3), tol (stop tolerance, 1e-3),
	    probability (must be set before fit), verbose,
	    max_iter (-1 means unlimited iterations).
	'''
	input_train,output_train,input_dev,output_dev=read_data()
	svm=OneVsRestClassifier( SVC(kernel='linear',verbose=True,probability=False)  )
	fitted=svm.fit(input_train,output_train)

	rate=fitted.score(input_dev,output_dev)
	print('模型的平均争取率：',rate*100)
	# Historical results: binary 0.3990; 4-class 0.7115 with kernel='rbf'.
	input_test,output_test=read_data(train=False)
	pred=fitted.predict(input_test)
	print(type(pred))
	label_names = ['原因', '细节', '建议','条件']
	print('多分类报告:')
	print(classification_report(output_test, pred, target_names=label_names))
	print('混淆矩阵:')
	cm=confusion_matrix(output_test, pred)   # rows: true class, cols: predicted class
	print(cm)

svm_used()
